<?php

/**
 * 数据处理
 * @author 暮雨秋晨
 * @copyright 2014
 */

/**
 * Extracts plain text, the page title and same-site links from an HTML string.
 *
 * The instance only holds data when BOTH constructor arguments are non-empty;
 * otherwise the properties stay null and the finder methods operate on an
 * empty document.
 */
class Analyze
{
    /** @var string|null Raw HTML document to analyse. */
    private $Resource = null;

    /** @var string|null Host part (without scheme) used to resolve and filter links. */
    private $BaseUrl = null;

    /** @var mixed Scratch buffer; not referenced by any method of this class. */
    private $Analyze_Tmp = null;

    /**
     * @param string $Resource HTML document to analyse.
     * @param string $BaseUrl  Host without scheme, e.g. "example.com"; relative
     *                         links are resolved as "http://<BaseUrl>/<path>".
     */
    public function __construct($Resource = '', $BaseUrl = '')
    {
        // A constructor's return value is ignored by `new`, so invalid input
        // simply leaves the object in its empty (null) state.
        if (!empty($Resource) && !empty($BaseUrl)) {
            $this->Resource = $Resource;
            $this->BaseUrl = $BaseUrl;
        }
    }

    /**
     * Strips scripts and tags from the document and decodes a small set of
     * HTML entities.
     *
     * Decoded characters are produced with chr(), i.e. as single bytes
     * (code points above 255 wrap modulo 256), not as UTF-8 sequences.
     *
     * @return string|null Plain text, or null if the regex engine fails.
     */
    public function fetchText()
    {
        $search = array(
            '~<script[^>]*?>.*?</script>~si', // drop inline JavaScript
            '~<[\/\!]*?[^<>]*?>~si',          // drop HTML tags
            '~([\r\n])[\s]+~',                // collapse whitespace after a line break
            '~&(quot|#34);~i',                // named / well-known numeric entities
            '~&(amp|#38);~i',
            '~&(lt|#60);~i',
            '~&(gt|#62);~i',
            '~&(nbsp|#160);~i',
            '~&(iexcl|#161);~i',
            '~&(cent|#162);~i',
            '~&(pound|#163);~i',
            '~&(copy|#169);~i',
        );
        $replace = array(
            '',
            '',
            '$1',
            '"',
            '&',
            '<',
            '>',
            ' ',
            chr(161),
            chr(162),
            chr(163),
            chr(169),
        );

        $text = preg_replace($search, $replace, (string) $this->Resource);
        if ($text === null) {
            return null;
        }

        // Remaining numeric entities. This used to rely on the /e modifier,
        // which was removed in PHP 7 and made the whole call return null.
        // It must run last, exactly like the original pattern order.
        return preg_replace_callback(
            '~&#(\d+);~',
            function ($entity) {
                return chr((int) $entity[1]);
            },
            $text
        );
    }

    /**
     * @return string|null The unmodified HTML passed to the constructor.
     */
    public function fetchHtml()
    {
        return $this->Resource;
    }

    /**
     * Returns the content of the first <title> element.
     *
     * @return string|null Title text, or null when absent or empty.
     */
    public function findTitle()
    {
        // Allow attributes on the tag (<title lang="en">) as well as a bare <title>.
        preg_match('!<title(?:\s[^>]*)?>(.*)<\/title>!Uis', (string) $this->Resource, $match);

        return !empty($match[1]) ? $match[1] : null;
    }

    /**
     * Collects the href of every anchor, resolved against the base host.
     *
     * Links that are empty, fragments, contain "@", or are absolute URLs that
     * do not contain the base host are discarded; duplicates are removed.
     * Keys are the positions of the surviving anchors in document order and
     * may therefore be non-contiguous.
     *
     * @return array|false List of URLs, or false if matching failed.
     */
    public function findLink()
    {
        // Only quotes delimit the attribute value; the former [\'|"] class also
        // treated "|" as a delimiter and truncated URLs containing a pipe.
        $found = preg_match_all(
            '!<a.*href=[\'"](.*)[\'"].*>(.*)<\/a>!Uis',
            (string) $this->Resource,
            $matchs
        );
        if ($found === false || !isset($matchs[1])) {
            return false;
        }

        $links = array();
        foreach ($matchs[1] as $position => $href) {
            $resolved = self::normalizeLink($href, $this->BaseUrl);
            if ($resolved !== null) {
                $links[$position] = $resolved;
            }
        }

        return array_unique($links);
    }

    /**
     * Turns a raw href into an absolute URL on the base host.
     *
     * @param string      $url     Raw href value.
     * @param string|null $baseUrl Host without scheme.
     *
     * @return string|null Absolute URL, or null when the link is rejected.
     */
    private static function normalizeLink($url, $baseUrl)
    {
        $url = trim($url);

        // Requires at least one alphanumeric character; this already rejects
        // '', '/', './' and a bare '#'.
        if (!preg_match('![A-Za-z0-9]!', $url)) {
            return null;
        }

        // '0' is rejected to match the original loose emptiness test.
        // strpos() must be compared with false: a leading '@' is at offset 0.
        if ($url === '0' || $url[0] === '#' || strpos($url, '@') !== false) {
            return null;
        }

        if (preg_match('!^https?://!i', $url)) {
            // Absolute URL: keep it only if it mentions the base host.
            return strpos($url, $baseUrl) !== false ? $url : null;
        }

        return 'http://' . $baseUrl . '/' . trim($url, '/');
    }
}

?>